library(tidyverse) # Data transformation and visualization
library(scales) # Enables percentage scale for ggplot graphs
library(rpart) # Decision Trees: the CART algorithm
library(C50) # Another implementation of decision trees in R: C5.0
library(party) # Another tree-based algorithm: conditional inference trees
library(rpart.plot) # Nice visualization of rpart Decision Trees
library(partykit) # Visualization of Conditional Inference and C5.0 Trees
library(mlr) # Machine learning framework for R. Here we use only the model accuracy metrics provided by the package.
# Load the employee turnover data set
d <- readRDS("turnover.RDS")
# Creating training and test samples: 2000 randomly chosen rows for
# training, the remaining rows for testing. seq_len() is safer than 1:nrow()
# (which would produce c(1, 0) for an empty data frame).
set.seed(1234)
train_indices <- sample(seq_len(nrow(d)), size = 2000)
d_train <- d[train_indices, ]
d_test <- d[-train_indices, ]
# What's inside the training sample
glimpse(d_train)
## Observations: 2,000
## Variables: 10
## $ satisfaction_level <dbl> 9.9, 7.2, 7.0, 4.2, 5.9, 8.2, 7.2, 4.4, ...
## $ last_evaluation <dbl> 6.8, 5.3, 5.3, 4.9, 7.9, 9.7, 8.8, 5.4, ...
## $ number_project <dbl> 5, 3, 2, 2, 3, 5, 3, 2, 5, 5, 2, 5, 6, 3...
## $ average_monthly_hours <dbl> 264, 179, 274, 152, 217, 263, 224, 139, ...
## $ time_in_company <dbl> 10, 3, 4, 3, 4, 5, 3, 3, 3, 4, 3, 2, 4, ...
## $ work_accident <fct> yes, no, no, no, no, no, no, no, no, no,...
## $ left <fct> stayed, stayed, left, left, stayed, left...
## $ promotion_last_5years <fct> no, no, no, no, no, no, no, no, no, no, ...
## $ department <fct> support, support, sales, support, produc...
## $ salary <fct> medium, low, low, low, medium, medium, l...
# Colour mapping shared by all plots in this document: green = stayed, red = left
cmap <- c(stayed = "green", left = "red")
# Jittered scatter of the two quantitative predictors, coloured by outcome
ggplot(d, aes(x = satisfaction_level, y = last_evaluation)) +
  geom_jitter(aes(colour = left), alpha = 0.5) +
  scale_x_continuous(breaks = 1:10) +
  scale_colour_manual(values = cmap) +
  labs(title = "Employee's decision to quit, job satisfaction and performance",
       colour = "Decision")
# CART tree using only the two quantitative predictors
m_rpart2 <- rpart(left ~ satisfaction_level + last_evaluation,
                  data = d_train)
# Graphical view of the fitted tree
rpart.plot(m_rpart2, tweak = 1.5,
           main = "Decision tree for satisfaction and evaluation")
# Text representation of the splits
print(m_rpart2)
## n= 2000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2000 470 stayed (0.766 0.234)
## 2) satisfaction_level>=4.6 1454 140 stayed (0.904 0.096) *
## 3) satisfaction_level< 4.6 546 220 left (0.399 0.601)
## 6) satisfaction_level>=1.1 422 200 stayed (0.517 0.483)
## 12) satisfaction_level< 3.5 170 13 stayed (0.924 0.076) *
## 13) satisfaction_level>=3.5 252 61 left (0.242 0.758)
## 26) last_evaluation>=5.8 36 3 stayed (0.917 0.083) *
## 27) last_evaluation< 5.8 216 28 left (0.130 0.870)
## 54) last_evaluation< 4.4 10 0 stayed (1.000 0.000) *
## 55) last_evaluation>=4.4 206 18 left (0.087 0.913) *
## 7) satisfaction_level< 1.1 124 0 left (0.000 1.000) *
Since we have just two quantitative predictors, we can visualize the decision boundary.
# A prediction grid: satisfaction 0-10 and evaluation 3-10, 0.1 step size
sat_values <- seq(0, 10, by = 0.1)
eval_values <- seq(3, 10, by = 0.1)
gr <- expand.grid(satisfaction_level = sat_values,
                  last_evaluation = eval_values)
# Attach the predicted class label and turnover probability to every grid cell
boundary_rpart2 <- gr %>%
  mutate(prob = predict(m_rpart2, newdata = gr, type = "prob")[,'left'],
         class = predict(m_rpart2, newdata = gr, type = "class"))
# Turnover probability as a grey gradient, with the training points on top.
# x/y mappings for geom_jitter are inherited from the top-level aes().
ggplot(boundary_rpart2, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")
# Predicted class label as the fill, with the training points on top.
# x/y mappings for geom_jitter are inherited from the top-level aes().
ggplot(boundary_rpart2, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = class)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_grey(start = 0.8, end = 0.2)
# The same tree grown with each of the two split criteria rpart supports:
# Gini impurity (the default) and information entropy
m_rpart2_gini <- rpart(left ~ satisfaction_level + last_evaluation,
                       data = d_train,
                       parms = list(split = "gini"))
rpart.plot(m_rpart2_gini, tweak = 1.25,
           main = "Decision tree based on Gini index")
m_rpart2_information <- rpart(left ~ satisfaction_level + last_evaluation,
                              data = d_train,
                              parms = list(split = "information"))
rpart.plot(m_rpart2_information, tweak = 1.25,
           main = "Decision tree based on Information Entropy")
The Confusion Matrix cross-tabulates the predicted and the actual class labels.
To create the confusion matrix, we add predicted class labels and use the table() function to cross-tabulate them against the actual class labels.
# Class labels predicted on the training sample
pr_rpart2_train <- predict(m_rpart2, newdata = d_train, type = "class")
# Confusion matrix: predictions cross-tabulated against the actual labels
table_rpart2_train <- table(predicted = pr_rpart2_train,
                            actual = d_train$left)
table_rpart2_train
## actual
## predicted stayed left
## stayed 1514 156
## left 18 312
Let’s calculate some model performance metrics on the training data:
options(digits = 3)
# Accuracy: share of correctly classified observations (the matrix diagonal)
n_correct <- table_rpart2_train["stayed", "stayed"] +
  table_rpart2_train["left", "left"]
100 * n_correct / sum(table_rpart2_train)
## [1] 91.3
# Mean Misclassification Error: share of misclassified observations
n_errors <- table_rpart2_train["stayed", "left"] +
  table_rpart2_train["left", "stayed"]
100 * n_errors / sum(table_rpart2_train)
## [1] 8.7
The above metrics can be readily calculated with the model metrics provided by the mlr package:
# Accuracy. mlr's measure functions take (truth, response) in that order;
# ACC and MMCE are symmetric so the value is unchanged, but passing the
# arguments in the documented order keeps the code correct for measures
# that are NOT symmetric.
measureACC(d_train$left, pr_rpart2_train) * 100
## [1] 91.3
# Mean misclassification error
measureMMCE(d_train$left, pr_rpart2_train) * 100
## [1] 8.7
Let’s compare the accuracy on the test sample
# Class labels predicted on the held-out test sample
pr_rpart2_test <- predict(m_rpart2, newdata = d_test, type = "class")
# Accuracy on the test sample; measureACC takes (truth, response)
measureACC(d_test$left, pr_rpart2_test) * 100
## [1] 90.7
table(predicted = pr_rpart2_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 766 90
## left 3 141
The accuracy is very similar to the accuracy on the training sample. This is good, our model didn’t overfit the training data.
Decision trees are prone to overfitting the training data. It is technically possible to grow a very large tree that classifies the training data perfectly. However, the rules added to such a tree capture not only the true relationships in the data but also the noise. The resulting tree is unnecessarily complex and won't achieve good accuracy on new data.
R's implementation of the tree-growing algorithm has reasonable defaults that prevent overfitting. To illustrate the point, we'll change the parameters of the algorithm to relax the early-stopping and tree-pruning rules.
# Deliberately overfit tree: tiny minimum node size, near-zero complexity
# penalty, and a deep tree
overfit_control <- rpart.control(
  minsplit = 2,   # Minimum node size for splitting
  cp = 0.00001,   # minimum relative improvement for splitting
  maxdepth = 20   # maximum depth of a leaf node
)
m_rpart2_overfit <- rpart(left ~ satisfaction_level + last_evaluation,
                          data = d_train,
                          control = overfit_control)
rpart.plot(m_rpart2_overfit)
# Attach predicted class labels and turnover probabilities to the grid
boundary_rpart2_overfit <- gr %>%
  mutate(prob = predict(m_rpart2_overfit, newdata = gr, type = "prob")[,'left'],
         class = predict(m_rpart2_overfit, newdata = gr, type = "class"))
# The decision boundary of the overfit tree; note how fragmented it is
ggplot(boundary_rpart2_overfit, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")
# Class labels predicted on both samples, for comparing accuracies below
pr_rpart2_overfit_train <- predict(m_rpart2_overfit, newdata = d_train,
                                   type = "class")
pr_rpart2_overfit_test <- predict(m_rpart2_overfit, newdata = d_test,
                                  type = "class")
The confusion matrix and accuracy on the training data
# Confusion matrix on the training sample
table(predicted = pr_rpart2_overfit_train, actual = d_train$left)
## actual
## predicted stayed left
## stayed 1511 32
## left 21 436
# Accuracy; measureACC takes (truth, response)
measureACC(d_train$left, pr_rpart2_overfit_train) * 100
## [1] 97.4
The confusion matrix and accuracy on the test data
# Confusion matrix on the test sample
table(predicted = pr_rpart2_overfit_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 733 66
## left 36 165
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_rpart2_overfit_test) * 100
## [1] 89.8
Plotting model error vs complexity parameter (cp)
# Cross-validated relative error vs. complexity parameter (cp) for the
# overfit tree; used to choose a cp value for pruning
plotcp(m_rpart2_overfit)
Tree pruning
# Prune the overfit tree back, removing splits that improve the fit by
# less than cp = 0.005
m_rpart2_pruned <- prune(m_rpart2_overfit, cp = 0.005)
rpart.plot(m_rpart2_pruned, cex = 0.9)
# Class labels predicted by the pruned tree on both samples
pr_rpart2_pruned_train <- predict(m_rpart2_pruned, newdata = d_train,
                                  type = "class")
pr_rpart2_pruned_test <- predict(m_rpart2_pruned, newdata = d_test,
                                 type = "class")
Confusion matrix and accuracy for the pruned tree - training sample
# Confusion matrix for the pruned tree on the training sample
table(predicted = pr_rpart2_pruned_train, actual = d_train$left)
## actual
## predicted stayed left
## stayed 1511 138
## left 21 330
# Accuracy; measureACC takes (truth, response)
measureACC(d_train$left, pr_rpart2_pruned_train) * 100
## [1] 92
Confusion matrix and accuracy for the pruned tree - test sample
# Confusion matrix for the pruned tree on the test sample
table(predicted = pr_rpart2_pruned_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 764 82
## left 5 149
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_rpart2_pruned_test) * 100
## [1] 91.3
After pruning, the drop in accuracy on the test sample is much smaller.
# Attach the predicted class labels and turnover probabilities to the grid
boundary_rpart2_pruned <- gr %>%
  mutate(prob = predict(m_rpart2_pruned, newdata = gr, type = "prob")[,'left'],
         class = predict(m_rpart2_pruned, newdata = gr, type = "class"))
# Decision boundary of the pruned tree, training points on top
ggplot(boundary_rpart2_pruned, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")
Another decision tree algorithm available in R is Conditional Inference Tree. This algorithm is implemented by the party package.
# Conditional inference tree on the same two predictors.
# NOTE(review): both party and partykit are attached; partykit::ctree masks
# party::ctree here because partykit was loaded later -- confirm this is
# the intended implementation.
m_ctree2 <- ctree(left ~ satisfaction_level + last_evaluation, data = d_train)
plot(m_ctree2)
# Attach predicted class labels and turnover probability to the grid
boundary_ctree2 <- gr %>%
  mutate(prob = predict(m_ctree2, newdata = gr, type = "prob")[,'left'],
         class = predict(m_ctree2, newdata = gr, type = "response"))
# Decision boundary of the conditional inference tree
ggplot(boundary_ctree2, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")
# Class labels predicted on the test sample
pr_ctree2_test <- predict(m_ctree2, newdata = d_test, type = "response")
The confusion matrix and accuracy for ctree - test sample
# Confusion matrix for ctree on the test sample
table(predicted = pr_ctree2_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 665 85
## left 104 146
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_ctree2_test) * 100
## [1] 81.1
The C5.0 algorithm by Ross Quinlan is a very popular decision tree algorithm, that is implemented in many commercial data mining packages. Recently the source code for this algorithm has been published by the author under an open license, so the algorithm is now available in R.
# A single C5.0 tree on the same two predictors
m_c50 <- C5.0(left ~ satisfaction_level + last_evaluation, data = d_train)
plot(m_c50)
# Attach predicted class labels and turnover probabilities to the grid
boundary_c50 <- gr %>%
  mutate(prob = predict(m_c50, newdata = gr, type = "prob")[,'left'],
         class = predict(m_c50, newdata = gr, type = "class"))
# Decision boundary of the C5.0 tree
ggplot(boundary_c50, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")
# Class labels predicted on the test sample
pr_c50_test <- predict(m_c50, newdata = d_test, type = "class")
The confusion matrix and accuracy for C5.0 tree - test sample
# Confusion matrix for the C5.0 tree on the test sample
table(predicted = pr_c50_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 763 77
## left 6 154
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_c50_test) * 100
## [1] 91.7
In this example, the performance is very close to the rpart.
The accuracy can be improved by combining the predictions from an ensemble of several models. The C5.0 package implements one such approach, called adaptive boosting. This algorithm grows a sequence of trees that are trained to correct the errors of the previously built trees. After the ensemble is built, all the trees in it make a prediction, and the class label is determined by voting.
The trials parameter controls the number of additional trees to include in the ensemble. This is the maximum number of trees; the actual number can be smaller when additional trees prove not to improve performance.
# Boosted C5.0 ensemble: up to 5 boosting iterations
m_c50_boost <- C5.0(left ~ satisfaction_level + last_evaluation,
                    data = d_train,
                    trials = 5)
m_c50_boost
##
## Call:
## C5.0.formula(formula = left ~ satisfaction_level + last_evaluation, data
## = d_train, trials = 5)
##
## Classification Tree
## Number of samples: 2000
## Number of predictors: 2
##
## Number of boosting iterations: 5
## Average tree size: 7.6
##
## Non-standard options: attempt to group attributes
# Attach predicted class labels and turnover probability to the grid
boundary_c50_boost <- gr %>%
  mutate(prob = predict(m_c50_boost, newdata = gr, type = "prob")[,'left'],
         class = predict(m_c50_boost, newdata = gr, type = "class"))
# Decision boundary of the boosted ensemble
ggplot(boundary_c50_boost, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")
# Class labels predicted on the test sample
pr_c50_boost_test <- predict(m_c50_boost, newdata = d_test, type = "class")
The confusion matrix and accuracy for the boosted c5.0 ensemble - test data
# Confusion matrix for the boosted C5.0 ensemble on the test sample
table(predicted = pr_c50_boost_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 732 48
## left 37 183
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_c50_boost_test) * 100
## [1] 91.5
The accuracy didn’t improve.
In many cases the negative consequences for different kinds of classification error are not equal. For example, when a fraud detection model misses one fraudulent transaction, the loss is significantly higher compared to the ‘false alarm’ error.
Similarly, when the model looks for customers who can be interested in a product or service, the loss of missing a potential customer is higher compared to the cost of making an additional call or sending an email.
To take misclassification cost into account, some algorithms allow specifying a different cost for each kind of misclassification error.
In our case we will specify 5 times higher cost for ‘missed opportunity’ type error: it’s more important to determine if an employee is considering leaving the company, and to try to prevent the loss of a valuable employee.
# Misclassification costs: predicting "stayed" for an employee who actually
# left (a missed opportunity to intervene) costs 5; the opposite error costs 1
cost_matrix <- matrix(c(0, 5,
                        1, 0),
                      ncol = 2, byrow = TRUE,
                      dimnames = list(predicted = c("stayed", "left"),
                                      actual = c("stayed", "left")))
cost_matrix
## actual
## predicted stayed left
## stayed 0 5
## left 1 0
Creating the cost-sensitive classifier
# Cost-sensitive C5.0 classifier using the cost matrix defined above
m_c50_cost <- C5.0(left ~ satisfaction_level + last_evaluation,
                   data = d_train,
                   cost = cost_matrix)
plot(m_c50_cost)
# Attach the predicted class labels to the grid (cost-sensitive C5.0
# models predict class labels only)
boundary_c50_cost <- gr %>%
  mutate(class = predict(m_c50_cost, newdata = gr, type = "class"))
# Decision boundary of the cost-sensitive classifier
ggplot(boundary_c50_cost, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = class)) +
  geom_jitter(data = d_train, aes(colour = left)) +
  scale_colour_manual(values = cmap) +
  scale_fill_manual(values = c("grey80","grey20"))
# Class labels predicted on the test sample
pr_c50_cost_test <- predict(m_c50_cost, newdata = d_test, type = "class")
The confusion matrix and the accuracy for the cost-sensitive c5.0 classifier - test sample:
# Confusion matrix for the cost-sensitive classifier on the test sample
table(predicted = pr_c50_cost_test, actual = d_test$left)
## actual
## predicted stayed left
## stayed 642 14
## left 127 217
# Accuracy; measureACC takes (truth, response). Note accuracy drops, but far
# fewer leavers are missed (14 vs 77 for the cost-insensitive tree).
measureACC(d_test$left, pr_c50_cost_test) * 100
## [1] 85.9
For small trees, the visualization used by the party package for ctree-produced trees is more concise.
The trees created by rpart can be also visualized this way. To do so, use the partykit::as.party() function.
# Side-by-side comparison: the same rpart tree drawn by rpart.plot and,
# after conversion with partykit::as.party(), by the party-style plotter
rpart.plot(m_rpart2,
           main = "Default visualization for rpart tree")
plot(as.party(m_rpart2),
     main = "rpart tree visualized by party")
# CART tree on ALL predictors. Train on the training sample only: the
# original code fit on the full data set `d` and then scored d_test, so the
# test rows leaked into training and the reported "test" accuracy was
# inflated. NOTE(review): the `##` output transcripts below were produced
# by the model fit on the full data set and will differ after this fix.
m_rpart_all <- rpart(left ~ ., data = d_train)
rpart.plot(m_rpart_all, cex = 1)
Confusion matrix for rpart - test sample
# Class labels predicted on the test sample by the all-predictors rpart tree
pr_rpart_all_test <- predict(m_rpart_all,
                             newdata = d_test,
                             type = "class")
table(predicted = pr_rpart_all_test,
      actual = d_test$left)
## actual
## predicted stayed left
## stayed 756 15
## left 13 216
Accuracy for rpart - test sample
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_rpart_all_test) * 100
## [1] 97.2
# Conditional inference tree on ALL predictors. Train on d_train only --
# the original code fit on the full data set `d`, leaking the test rows
# into training and inflating the test accuracy reported below.
m_ctree_all <- ctree(left ~ ., data = d_train)
plot(m_ctree_all)
Confusion matrix for ctree - test sample
# Class labels predicted on the test sample by the all-predictors ctree
pr_ctree_all_test <- predict(m_ctree_all,
                             newdata = d_test,
                             type = "response")
table(predicted = pr_ctree_all_test,
      actual = d_test$left)
## actual
## predicted stayed left
## stayed 726 19
## left 43 212
Accuracy for ctree - test sample
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_ctree_all_test) * 100
## [1] 93.8
# C5.0 tree on ALL predictors. Train on d_train only -- the original code
# fit on the full data set `d`, leaking the test rows into training and
# inflating the test accuracy reported below.
m_c50_all <- C5.0(left ~ ., data = d_train)
plot(m_c50_all)
Confusion matrix for c5.0 - test sample
# Class labels predicted on the test sample by the all-predictors C5.0 tree
pr_c50_all_test <- predict(m_c50_all,
                           newdata = d_test,
                           type = "class")
table(predicted = pr_c50_all_test,
      actual = d_test$left)
## actual
## predicted stayed left
## stayed 762 15
## left 7 216
Accuracy for C5.0 - test sample
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_c50_all_test) * 100
## [1] 97.8
# Boosted C5.0 ensemble on ALL predictors, up to 100 boosting iterations.
# Train on d_train only -- the original code fit on the full data set `d`,
# leaking the test rows into training and inflating the test accuracy.
# NOTE(review): the transcript below was produced by the full-data fit
# (hence "Number of samples: 3000") and will differ after this fix.
m_c50_boost_all <- C5.0(left ~ ., data = d_train, trials = 100)
m_c50_boost_all
##
## Call:
## C5.0.formula(formula = left ~ ., data = d, trials = 100)
##
## Classification Tree
## Number of samples: 3000
## Number of predictors: 9
##
## Number of boosting iterations: 100
## Average tree size: 23.8
##
## Non-standard options: attempt to group attributes
Confusion matrix for boosted c5.0 - test sample
# Class labels predicted on the test sample by the boosted ensemble
pr_c50_boost_all_test <- predict(m_c50_boost_all,
                                 newdata = d_test,
                                 type = "class")
table(predicted = pr_c50_boost_all_test,
      actual = d_test$left)
## actual
## predicted stayed left
## stayed 767 10
## left 2 221
Accuracy for boosted C5.0 - test sample
# Accuracy; measureACC takes (truth, response)
measureACC(d_test$left, pr_c50_boost_all_test) * 100
## [1] 98.8